Language codes (from WALS) – WALS, ISO, ascii-name. These are used to merge across datasets.
# Language codes (WALS / ISO / name) used to merge across all datasets below.
# When a language lists several comma-separated ISO codes, keep the first.
codes <- read.csv("../data/language_codes.csv") %>%
  select(language, WALS, ISO) %>%
  mutate(ISO = vapply(strsplit(as.character(ISO), ","),
                      function(parts) parts[1], character(1)))
Lupyan & Dale (2010): Demographic variables and syntactic complexity
# Lupyan & Dale (2010) demographic and WALS-complexity data. "*" marks
# missing values in the source file; walsCode links each row to `codes`.
ld <- read.table("../data/lupyan_2010.txt", fill = TRUE,
                 header = TRUE, sep = "\t", na.strings = "*") %>%
  left_join(codes, by = c("walsCode" = "WALS")) %>%
  rename(pop_log = logpop2,
         lang.family = langFamily,
         lang.genus = langGenus,
         native.country = nativeCountry,
         native.country.area = nativeCountryArea,
         mean.temp = aveTemp,
         sd.temp = sdTemp,
         sum.precip = sumPrecip,
         sd.precip = sdPrecip,
         n.neighbors = numNeighbors,
         growing.season = growingSeason)
# get means across language
# NOTE(review): columns are selected by position (8:9 and 16:121 feed the
# per-ISO means; the final select keeps a positional subset of the result).
# This is fragile if lupyan_2010.txt's column order ever changes — verify
# the indices against the data file.
ld.demo = ld %>%
group_by(ISO) %>%
summarise_each(funs(mean(., na.rm = TRUE)),
c(8:9, 16:121)) %>%
select(1:3, 93:95, 100:101, 103:105, 108)
# add in data family: one row of family/genus/country metadata per ISO code
# (slice(1) keeps the first record within each ISO group)
fams <- ld %>%
  group_by(ISO) %>%
  slice(1) %>%
  select(ISO, lang.family, lang.genus, native.country,
         native.country.area, language)
# we're getting country here by choosing the most frequent country for a language
# NOTE(review): most.frequent.level() is a project helper defined elsewhere;
# the bare left_join(fams) relies on dplyr's natural join over shared columns.
fams.country = ld %>%
group_by(ISO) %>%
summarise(country = most.frequent.level(country)) %>%
left_join(fams)
# Attach the aggregated demographic means to the per-ISO metadata and
# drop the grouping left over from the summaries above.
ld.demo <- fams.country %>%
  left_join(ld.demo, by = "ISO") %>%
  ungroup()
WALS features of complexity
# get variables to include
# NOTE(review): columns are selected by position — 18:103 appear to be the
# WALS feature columns and 124 the ISO column (it is grouped on below);
# verify against the data file. Each feature is collapsed to its most
# frequent level per ISO (most.frequent.level is a project helper).
qual = ld %>%
select(18:103, 124) %>%
mutate_each(funs(as.factor)) %>%
group_by(ISO) %>%
summarise_each(funs(most.frequent.level)) %>%
mutate_each(funs(as.factor))
# Feature-to-complexity mappings; restrict qual to ISO plus the feature
# columns that actually have a mapping.
ld_feature_names <- read.csv("../data/lupyan_2010_all_feature_mappings.csv")
qualVarNames <- intersect(names(qual), ld_feature_names$WALS.feature.name)
qual <- select(qual, which(names(qual) %in% c("ISO", qualVarNames)))
# remap factor levels to complexity values (0,1)
# note: two variables that are reported in the paper (NICPOC[59] and
# CYSIND[39]) are missing in the data
for (i in seq_along(qualVarNames)) {
  thisVarLabs = ld_feature_names[ld_feature_names$WALS.feature.name == qualVarNames[i],]
  old = thisVarLabs$ld.level.label
  new = thisVarLabs$ld.complexity
  # Match the column by exact name. The original grep() treated the feature
  # name as a regular expression and also matched any column merely
  # containing it as a substring, which can remap the wrong column(s).
  col_i = which(colnames(qual) == qualVarNames[i])
  qual[,col_i] = mapvalues(as.matrix(qual[,col_i]),
                           from = as.character(old),
                           to = as.character(new), warn_missing = TRUE)
}
# Sum the per-feature complexity codes into one morphological-complexity
# score per language.
# NOTE(review): gather(2:28) assumes exactly 27 feature columns after ISO,
# and sum() without na.rm propagates NA for any language missing a feature —
# confirm both are intended.
ld.complexity = qual %>%
mutate_each(funs(as.numeric), -ISO) %>%
gather(variable, complexity.level, 2:28) %>%
group_by(ISO) %>%
summarise(morphological.complexity = sum(complexity.level))
ld.demo.qual = left_join(ld.demo, ld.complexity)
Bentz, et al. (2015): L2 learners and lexical diversity Note here that we are only looking at languages from the UDHR corpus. The other corpora have smaller languages only.
# Bentz et al. lexical-diversity data: stack the LDT* columns, fuse each with
# its measure name ("<measure>.<LDT column>"), and spread back so each
# measure/LDT combination is its own column. Keep only the UDHR corpus rows
# (the other corpora cover smaller languages), then rename to this
# project's conventions.
bentz = read.csv("../data/bentz_2015.csv") %>%
gather(temp, LDT, starts_with("LDT")) %>%
unite(temp1, measure, temp, sep = ".") %>%
spread(temp1, LDT) %>%
filter(text == "UDHR") %>%
select(iso_639_3, RatioL2, a.LDTscaled,
H.LDTscaled, TTR.LDTscaled, Stock,
Region, L1_speakers, L2_speakers) %>%
rename(ISO = iso_639_3,
n.L1.speakers = L1_speakers,
n.L2.speakers = L2_speakers,
ratio.L2.L1 = RatioL2,
stock = Stock,
region = Region,
scaled.LDT.TTR = TTR.LDTscaled,
scaled.LDT.ZM = a.LDTscaled,
scaled.LDT.H = H.LDTscaled) %>%
left_join(codes, by="ISO") %>%
mutate(ISO = as.factor(ISO)) %>%
select(-language, -WALS)
Atkinson (2011)
# Atkinson (2011) phoneme-diversity measures. Some ISO cells contain
# several space-separated codes; keep the first.
atkinson <- read.csv("../data/atkinson_2011.csv") %>%
  select(normalized.vowel.diversity,
         normalized.consonant.diversity,
         normalized.tone.diversity,
         normalized.phoneme.diversity, ISO,
         distance.from.origin) %>%
  mutate(ISO = vapply(strsplit(as.character(ISO), " "),
                      function(parts) parts[1], character(1))) %>%
  left_join(codes, by = "ISO") %>%
  select(-language, -WALS)
Lewis & Frank (under review): Complexity Bias
# complexity bias
# NOTE(review): distinct(ISO) here relies on older dplyr behavior that kept
# the other columns (newer dplyr needs .keep_all = TRUE); the subsequent
# filter()/select() on language/WALS depend on those columns surviving.
cb = read.csv("../data/lewis_2015.csv") %>%
rename(complexity.bias = corr,
p.complexity.bias = p.corr,
mono.complexity.bias = mono.cor,
open.complexity.bias = open.cor) %>%
left_join(codes, by="language") %>%
select(-X.1, -X, -lower.ci, -upper.ci, -checked,
-mean.length) %>%
distinct(ISO) %>%
filter(language != "english") %>% # english is an outlier in this dataset because norms collected in english
select(-language, -WALS)
Futrell, Mahowald, & Gibson (2015): Dependency length
# Futrell et al. dependency-length data: attach ISO codes, keep the observed
# mean dependency length, and drop the baseline/slope columns we do not use.
dl <- read.csv("../data/futrell_2015.csv") %>%
  left_join(codes, by = "language") %>%
  rename(mean.dependency.length = m_obs) %>%
  select(-language, -WALS, -fixed.random.baseline.slope,
         -observed.slope, -m_rand)
Pellegrino, Coupe, & Marsico (2015): Information density
# Pellegrino et al. information-density measures, keyed to ISO via `codes`.
uid_raw <- read.csv("../data/pellegrino_2015.csv")
uid <- uid_raw %>%
  left_join(codes, by = "language") %>%
  select(-WALS, -language)
Moran, McCloy, & Wright (2012): Phonemic inventory
# Moran et al. phoneme-inventory counts, renamed from the source's
# three-letter column abbreviations.
phoneme <- read.csv("../data/moran_2012.csv") %>%
  rename(n.phonemes = pho,
         n.consonants = con,
         n.vowels = vow,
         n.sonorants = son,
         n.obstruents = obs,
         n.monophthongs = mon,
         n.qual.monophthongs = qua,
         n.tones = ton) %>%
  select(ISO, n.phonemes, n.consonants, n.vowels, n.sonorants,
         n.obstruents, n.monophthongs, n.qual.monophthongs, n.tones)
Wichmann, et al. (2013): Mean word length
# note is this IPA? - fix this!
# Wichmann et al. wordlists: wide (one column per word) -> long, then compute
# character length per translation after stripping whitespace. Where a cell
# holds several comma-separated forms, nchar is averaged across them.
# NOTE(review): select(1,1:109) re-lists column 1; dplyr de-duplicates, so
# this is effectively select(1:109).
ml = read.csv("../data/wichmann_2013.csv") %>%
select(1,1:109) %>%
gather(word,translation,I:name) %>%
mutate(nchar = unlist(
lapply(
lapply(
strsplit(
gsub("[[:space:]]", "", translation) ,
","),
nchar), mean))) %>%
filter(translation != "")
# subset to only those words in the swadesh list (n = 40)
# NOTE(review): assumes the ISO "gwj" rows cover exactly the 40 Swadesh
# items and nothing else — confirm against the source data.
swadesh.words = ml[ml$ISO == "gwj", "word"]
ml = ml %>%
filter(is.element(word, swadesh.words))
# mean word length per language across the Swadesh items
ml.d = ml %>%
group_by(ISO) %>%
summarize(mean.length = mean(nchar, na.rm = T))
Luniewska, et al. (2015): Age of acquisition (AoA)
# Luniewska et al. AoA norms: stack the per-language "*aoa" columns,
# recover the language name from the column prefix (text before "_"),
# then average AoA per ISO code.
# NOTE(review): select(-3:-27) drops item-level columns by position —
# fragile if the CSV layout changes.
aoa.raw = read.csv("../data/luniewska_2015.csv",
header = T, fileEncoding = "latin1")
aoa = aoa.raw %>%
gather(language_aoa, aoa, grep("aoa",
names(aoa.raw))) %>%
mutate(language_aoa = tolower(unlist(lapply(strsplit(
as.character(language_aoa),"_"), function(x) x[1])))) %>%
select(-3:-27) %>%
rename(language = language_aoa) %>%
left_join(codes, by="language") %>%
group_by(ISO) %>%
summarize(mean.aoa = mean(aoa, na.rm = T))
Merge data frames together by ISO
# Merge all per-dataset tables into one row per ISO code.
# NOTE(review): as with cb above, distinct(ISO) relies on older dplyr
# behavior that kept the remaining columns; newer dplyr needs
# .keep_all = TRUE here.
d = full_join(ld.demo.qual, cb, by = "ISO") %>%
full_join(bentz, by = "ISO") %>%
full_join(dl, by = "ISO") %>%
full_join(uid, by = "ISO") %>%
full_join(phoneme, by = "ISO") %>%
full_join(ml.d, by = "ISO") %>%
full_join(aoa, by = "ISO") %>%
full_join(atkinson, by = "ISO") %>%
distinct(ISO) %>%
filter(!is.na(ISO), ISO !="") %>%
mutate(ISO = as.factor(ISO))
#write.csv(d, "../data/langLearnVar_data.csv")
load data
# Read the previously merged dataset back in; the first column is the
# row-name index written by write.csv, so drop it.
d <- read.csv("../data/langLearnVar_data.csv")
d <- d[, -1]
Remove outliers and check for normality
# Trim extreme values from every numeric column, leaving identifier and
# grouping columns untouched. removeOutlier is a project helper; the 3 is
# presumably an SD cutoff — confirm its definition.
d.clean = d %>%
mutate_each(funs(removeOutlier(.,3)), c(-ISO, -lang.family,
-native.country, -native.country.area,
-lang.genus, -language, -stock, -region, -country))
# QQ-plots of every numeric variable, one facet each, to eyeball normality
# before transforming.
d.clean %>%
  select(-ISO, -language, -lang.family, -lang.genus,
         -native.country, -native.country.area,
         -stock, -region, -country) %>%
  gather(var, value) %>%
  ggplot(aes(sample = value)) +
  stat_qq(na.rm = TRUE, size = .2) +
  facet_wrap(~ var, scales = "free") +
  theme_bw()
The following variables look right-skewed.
# Right-skewed count/ratio variables to be log-transformed below
NN_vars <- c(
  "area", "perimeter", "n.neighbors", "sd.precip",
  "ratio.L2.L1", "n.L1.speakers", "n.L2.speakers",
  "n.phonemes", "n.consonants", "n.vowels", "n.sonorants",
  "n.obstruents", "n.monophthongs", "n.qual.monophthongs",
  "n.tones"
)
Take the log of these variables, and check for normality again.
# Log-transform the skewed variables. The extra "dummy" entry in funs() is a
# trick: with more than one function, mutate_each suffixes the new columns
# (var_log, var_dummy) instead of overwriting var, so we can keep var_log,
# drop the dummy columns, and then drop the untransformed originals.
d.clean = d.clean %>%
mutate_each(funs(log = log(.), dummy = sum(.)),
one_of(NN_vars)) %>%
select(-contains("dummy")) %>%
select(-one_of(NN_vars))
# Re-check normality with QQ-plots after the log transforms.
d.clean %>%
  select(-ISO, -language, -lang.family, -lang.genus,
         -native.country, -native.country.area,
         -stock, -region, -country) %>%
  gather(var, value) %>%
  ggplot(aes(sample = value)) +
  stat_qq(na.rm = TRUE, size = .2) +
  facet_wrap(~ var, scales = "free") +
  theme_bw()
Check for normality of variables using Shapiro-Wilk Normality Test
# log(0) produced -Inf above; recode infinities to NA before testing.
is.na(d.clean) <- sapply(d.clean, is.infinite)
# Shapiro-Wilk p-value per numeric column; tidy(...)[2] pulls the p.value
# column out of the one-row tidied test result.
normality.test = d.clean %>%
summarise_each(funs(unlist(tidy(shapiro.test(.))[2])),
c(-ISO, -lang.family, -native.country,
-native.country.area, -country,
-lang.genus, -language, -stock, -region)) %>%
gather(variable, shapiro.p.value) %>%
mutate(normal = ifelse(shapiro.p.value > .05, TRUE, FALSE)) %>%
summary()
By this test, only 14 variables are normal. But I think this is the best we can do (?).
Look at histograms of all variables.
# Demographic predictor names used to classify columns in the plots below.
# ("n.neighbors_log" was listed twice in the original; duplicates are
# harmless to is.element() checks but misleading, so each name appears once.)
demo_vars = c("lat", "lon", "perimeter_log", "n.neighbors_log",
              "mean.temp", "sum.precip",
              "sd.temp", "growing.season", "pop_log", "area_log",
              "sd.precip_log", "ratio.L2.L1_log",
              "n.L1.speakers_log",
              "n.L2.speakers_log", "distance.from.origin")
# Everything not demographic counts as a language variable; the positional
# drop removes identifier columns and two unused entries.
# NOTE(review): [-c(1:6, 16, 17)] depends on d.clean's exact column order.
lang_vars = c(setdiff(names(d.clean), demo_vars))[-c(1:6, 16,17)]
# Histograms of every variable, colored by demographic vs language type.
# factor(variable, variable) pins the facet order to the arranged order.
d.clean %>%
gather(variable, num, lat:length(d.clean)) %>%
filter(variable != "stock", variable != "region") %>%
mutate(var_type = ifelse(is.element(variable, demo_vars),
"demo", "lang"),
num = as.numeric(as.character(num))) %>%
arrange(var_type) %>%
mutate(variable = factor(variable, variable)) %>%
ggplot(aes(x = num, fill = var_type)) +
geom_histogram(position = "identity") +
facet_wrap( ~ variable, scales = "free") +
theme_bw()
Number of variables we have data for, for each language
# as is this plot is too hard to read to be useful
# Count non-missing variables per language (column ranges chosen by position
# to skip identifier columns — verify against d.clean) and plot languages
# with more than 10 variables.
d.clean %>%
gather(variable, num, c(7:21, 23:48)) %>%
group_by(language) %>%
summarize(counts = length(which(is.na(num) == F))) %>%
filter(!is.na(counts), counts > 10, !is.na(language)) %>%
ggplot(aes(y = counts, x = language)) +
geom_bar(stat = "identity") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
ylab("Number of variables (minimum 10)") +
ggtitle("Number variables per language")
Number of languages we have data for, for each variable
# Count languages with data for each variable, truncating bars at 150 so
# the small datasets remain visible on the same scale.
d.clean %>%
gather(variable, num, lat:length(d.clean)) %>%
filter(variable != "stock", variable != "region") %>%
group_by(variable) %>%
summarize(counts = length(which(is.na(num) == F))) %>%
mutate(var_type = ifelse(is.element(variable, demo_vars),
"demo", "lang")) %>%
mutate(counts_trunc = ifelse(counts > 150, 150, counts)) %>%
arrange(var_type) %>%
mutate(variable = factor(variable, variable)) %>%
ggplot(aes(y=counts_trunc, x = variable, fill = var_type)) +
geom_bar(stat = "identity") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position = "none") +
ylab("Number of languages (truncated at 150)") +
ggtitle("Number languages per measure")
Look at number of languages intersecting each variable
# Grouping variables used as areal/phylogenetic controls in the models below
arealControl_vars <- c(
  "stock", "region", "lang.family", "lang.genus",
  "native.country", "native.country.area", "country"
)
# Heatmap of how many languages have data for each (language measure,
# demographic measure) pair, after expanding to all pairs in long format.
d.clean %>%
gather(lang_measure, lang_value,
which(is.element(names(.), c(lang_vars, arealControl_vars)))) %>%
group_by(lang_measure) %>%
gather(pop_measure, pop_value,
which(is.element(names(.), demo_vars))) %>%
group_by(lang_measure, pop_measure) %>%
mutate(both_not_na = !is.na(lang_value) & !is.na(pop_value)) %>%
summarise(n_languages = length(which(both_not_na == TRUE))) %>%
ggplot(aes(pop_measure, lang_measure, group = lang_measure)) +
geom_tile(aes(fill = n_languages)) +
geom_text(aes(fill = n_languages, label = n_languages)) +
scale_fill_gradient(low = "white", high = "red") +
xlab("population variables") +
ylab("language variables") +
ggtitle("Number languages between variables") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Correlation matrix (stuff in the upper right quadrant matters)
# Recode infinities to NA, then plot pairwise correlations.
# NOTE(review): -c(1:3, 36) drops non-numeric columns by position — verify
# these indices against d's current column order.
is.na(d) <- sapply(d, is.infinite)
corrs = cor(d[,-c(1:3, 36)], use = "pairwise.complete.obs")
corrplot(corrs)
Scatter plots
Fits controlling for family – controlling for family in a mixed effect model
# these are the variables we care about for the paper
# ("n.neighbors_log" appeared twice in the original list; the is.element()
# checks below are unaffected, but each name is listed once here)
demo_vars_crit = c("n.neighbors_log",
                   "mean.temp", "sum.precip", "sd.temp",
                   "pop_log", "area_log",
                   "sd.precip_log", "ratio.L2.L1_log",
                   "distance.from.origin")
lang_vars_crit = c("p.complexity.bias","scaled.LDT.TTR",
                   "mean.dependency.length",
                   "mean.length", "mean.aoa",
                   "n.consonants_log", "n.vowels_log", "morphological.complexity")
# Long table of (language measure, demographic measure) pairs for languages
# with pop_log > 2, keeping lang.family/country for the random effects.
d.scatter.fam = d.clean %>%
filter(pop_log>2) %>%
select(ratio.L2.L1_log, pop_log, distance.from.origin,
n.neighbors_log, area_log, mean.temp, sd.temp, sum.precip,
sd.precip_log, n.consonants_log, n.vowels_log, mean.length,
p.complexity.bias, scaled.LDT.TTR, morphological.complexity,
mean.aoa, lang.family, country) %>%
gather(lang_measure, lang_value,
which(is.element(names(.), lang_vars_crit))) %>%
group_by(lang_measure) %>%
gather(pop_measure, pop_value,
which(is.element(names(.), demo_vars_crit)))
# drop infinities (from the log transforms) and incomplete pairs
is.na(d.scatter.fam) <- sapply(d.scatter.fam, is.infinite)
d.scatter.fam <- filter(d.scatter.fam, !is.na(pop_value),
!is.na(lang_value))
# get model fits
# One mixed model per measure pair: random slope by family, random intercept
# by country. |t| > 1.96 is used as an approximate significance cutoff
# (lmer reports no p-values). The trailing pop_value/lang_value columns are
# dummy coordinates so this frame can be layered onto the scatter plot.
d.model.fits = d.scatter.fam %>%
group_by(lang_measure, pop_measure) %>%
do(tidy(lmer(lang_value ~ pop_value + (pop_value|lang.family) +
(1|country), data=.))) %>%
filter(term == "pop_value") %>%
mutate(sig = ifelse(abs(statistic) > 1.96, "*", "")) %>%
mutate(sig.col = ifelse(statistic > 1.96, "pos",
ifelse(statistic < -1.96, "neg",
"none"))) %>%
mutate(pop_value = .1, lang_value = .1) %>% # this is a hack
ungroup
# Recode NaN fit statistics to real missing values. The original assigned
# the *string* "NA", which silently coerced numeric columns to character;
# the NA constant keeps the column types intact.
d.model.fits[d.model.fits == "NaN"] = NA
# plot
# Scatter + lm fit per measure pair; the geom_rect background encodes the
# mixed-model result (blue = negative, grey = n.s., red = positive).
ggplot(d.scatter.fam, aes(x = pop_value, y = lang_value)) +
geom_rect(data = d.model.fits, aes(fill = sig.col),
xmin = -Inf, xmax = Inf,
ymin = -Inf, ymax = Inf, alpha = 0.2) +
geom_point(size = .3) +
geom_smooth(method = "lm", color = "green") +
facet_grid(lang_measure ~ pop_measure, scales = "free") +
scale_fill_manual(values = c( "mediumblue", "grey99","red1")) +
theme_bw() +
xlab("Demographic variables") +
ylab("Language variables") +
theme(legend.position = "none")
# get demographic variables only (excluding L2 because intersections contains only 40 languages)
d.demo = d.clean %>%
select(lat, mean.temp, sd.temp, sum.precip, sd.precip_log,
area_log, pop_log, n.neighbors_log) %>%
mutate(lat = abs(lat))
# recode -Inf (from log transforms) to NA so na.omit can drop those rows
is.na(d.demo) <- sapply(d.demo, is.infinite)
# do pca
# complete cases only, variables scaled to unit variance
pca = prcomp(na.omit(d.demo), scale = TRUE)
# look at eigen values
barplot(pca$sdev/pca$sdev[1])
fviz_eig(pca, addlabels=TRUE,
linecolor ="red", geom = "line") +
theme_bw()
# plot loadings
# gather() stacks the 8x8 rotation matrix column by column, so the 8
# variable names recycle in the correct order down the 64 rows.
pcs <- cbind(names(d.demo),
gather(as.data.frame(pca$rotation),
pc, value))
names(pcs)[1] = "variable"
pcs$variable = factor(pcs$variable , levels = pcs$variable[1:8]) # order variables
# flag loadings large enough to highlight in the plot
pcs$magnitude = ifelse(abs(pcs$value)>.2, TRUE, FALSE)
# Bar chart of PCA loadings, one facet per component; loadings with
# |value| > .2 are highlighted in red.
ggplot(pcs, aes(x = variable, y = value, fill = magnitude)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ pc) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_fill_manual(name = "Magnitude",
                    values = c("grey", "red1")) +
  ggtitle("PCA loadings")
# merge PCAs with full dataset for model fits
# NOTE(review): d.demo is destructively reused here as a missingness
# indicator (0 = observed, 1 = NA); good_is marks the complete-case rows
# that prcomp kept, so pca$x lines up row-for-row with d.clean[good_is,].
d.demo[!is.na(d.demo)] = 0
d.demo[is.na(d.demo)] = 1
good_is = which(rowSums(d.demo) == 0)
d.pca = d.clean[good_is,] %>%
cbind(pca$x)
# interpretive names for the first three components (from the loadings plot)
names(d.pca)[names(d.pca) == "PC1"] = "wet_and_hot_climate"
names(d.pca)[names(d.pca) == "PC2"] = "small"
names(d.pca)[names(d.pca) == "PC3"] = "desert_climate"
Principal components by language
# World maps of the first three principal components at each language's
# location (mapTheme is a project-defined ggplot theme).
ggplot(d.pca) +
borders("world", colour="gray50", fill="gray50") +
geom_point(aes(x = lon, y=lat,
color = wet_and_hot_climate), size=3) +
scale_colour_gradient(high="green", low = "white") +
ggtitle("Wet and hot (PC1)") +
mapTheme
ggplot(d.pca) +
borders("world", colour="gray50", fill="gray50") +
geom_point(aes(x = lon, y=lat,
color = small), size=3) +
scale_colour_gradient(high="blue", low = "white") +
ggtitle("small (PC2)") +
mapTheme
ggplot(d.pca) +
borders("world", colour="gray50", fill="gray50") +
geom_point(aes(x = lon, y=lat,
color = desert_climate), size=3) +
scale_colour_gradient(high="red", low = "white") +
ggtitle("Desert climate (PC3)") +
mapTheme
Model fits using PCA
# Variable sets for the PCA-based model fits below
demo_vars_crit2 <- c("wet_and_hot_climate", "small", "desert_climate")
lang_vars_crit2 <- c(
  "morphological.complexity", "p.complexity.bias", "scaled.LDT.TTR",
  "mean.length", "mean.aoa",
  "n.consonants_log", "n.vowels_log"
)
# Long-format pairs of the language measures against the three PCA
# components, keeping family/country for the random effects.
d.scatter.fam = d.pca %>%
select(n.consonants_log, n.vowels_log,
mean.length, p.complexity.bias, scaled.LDT.TTR, morphological.complexity, mean.aoa,
lang.family, country, wet_and_hot_climate, small,
desert_climate) %>%
gather(lang_measure, lang_value,
which(is.element(names(.), lang_vars_crit2))) %>%
group_by(lang_measure) %>%
gather(pop_measure, pop_value,
which(is.element(names(.), demo_vars_crit2)))
# drop infinities and incomplete pairs before fitting
is.na(d.scatter.fam) <- sapply(d.scatter.fam, is.infinite)
d.scatter.fam <- filter(d.scatter.fam, !is.na(pop_value),
!is.na(lang_value))
# get model fits
# Mixed model per pair (random slope by family, random intercept by
# country); |t| > 1.96 as an approximate significance cutoff. The dummy
# pop_value/lang_value columns let this frame layer onto the scatter plot.
d.model.fits = d.scatter.fam %>%
group_by(lang_measure, pop_measure) %>%
do(tidy(lmer(lang_value ~ pop_value + (pop_value|lang.family) + (1|country), data=.))) %>%
filter(term == "pop_value") %>%
mutate(sig.col = ifelse(statistic > 1.96, "pos",
ifelse(statistic < -1.96, "neg",
"none"))) %>%
mutate(pop_value = .1, lang_value = .1) %>% # this is a hack
ungroup
# Recode NaN fit statistics to true NA (the original assigned the string
# "NA", which coerces numeric columns to character).
d.model.fits[d.model.fits == "NaN"] = NA
# Scatter + lm fit per pair; background tile encodes the mixed-model result
# (blue = negative, grey = n.s., red = positive).
ggplot(d.scatter.fam, aes(x = pop_value, y = lang_value)) +
geom_rect(data = d.model.fits, aes(fill = sig.col),
xmin = -Inf, xmax = Inf,
ymin = -Inf, ymax = Inf, alpha = 0.2) +
geom_point(size = .3) +
geom_smooth(method = "lm", color = "green") +
facet_grid(lang_measure~pop_measure, scales = "free") +
scale_fill_manual(values = c( "mediumblue", "grey99","red1")) +
theme_bw() +
xlab("Demographic variables") +
ylab("Language variables") +
theme(legend.position = "none")
# Reproduce Lupyan & Dale's Fig. 3: mean log population (with bootstrapped
# CIs from the project helper multi_boot) at each |morphological complexity|
# level, plus a linear trend over the numeric complexity values.
ld.demo.qual %>%
mutate(morphological.complexity.factor = as.factor(abs(morphological.complexity))) %>%
group_by(morphological.complexity.factor) %>%
multi_boot(column="pop_log",
summary_groups = c("morphological.complexity.factor"),
statistics_functions = c("mean", "ci_lower","ci_upper")) %>%
mutate(morphological.complexity = as.numeric(as.character(morphological.complexity.factor))) %>%
ggplot() +
geom_pointrange(aes(x= morphological.complexity.factor,
y = mean, ymin = ci_lower, ymax = ci_upper)) +
geom_smooth(method = "lm",aes(morphological.complexity-5, mean)) +
# there's a weird offset because complexity is a factor?
ylab("Log population") +
xlab("Morphological complexity") +
ggtitle("Lupyan and Dale (2010) Fig. 3 reproduction") +
theme_bw()
SHINY APP
# Default measures for the Shiny demo below. Filter on the *values* held in
# lmeasure/pmeasure; the original compared against the literal strings
# "lmeasure"/"pmeasure", which match no rows and left subdata empty.
lmeasure <- "lexical.diversity"
pmeasure <- "log.RatioL2"
subdata <- filter(d.scatter.fam2,
                  lang_measure == lmeasure, pop_measure == pmeasure)
# Interactive explorer: pick a language measure and a population measure,
# see the scatter with an lm fit plus the lmer summary for that pair.
shinyApp(
  ui = fluidPage(
    selectInput("lmeasure", "Language Measure:",
                choices = as.character(unique(d.scatter.fam2$lang_measure)),
                selected = "lexical.diversity"),
    selectInput("pmeasure", "Population Measure:",
                choices = as.character(unique(d.scatter.fam2$pop_measure)),
                selected = "log.RatioL2"),
    plotOutput("scatterPlot"),
    verbatimTextOutput("lmer")
  ),
  server = function(input, output) {
    output$scatterPlot <- renderPlot({
      qplot(pop_value, lang_value,
            data = filter(d.scatter.fam2, lang_measure == input$lmeasure &
                            pop_measure == input$pmeasure)) +
        geom_smooth(method = "lm") +
        theme_bw()
    })
    # renderPrint (not textOutput, which is a UI function) is the server-side
    # pair of verbatimTextOutput() above; it captures the printed lmer summary
    output$lmer <- renderPrint({
      summary(lmer(lang_value ~ pop_value + (pop_value | langFamily),
                   data = filter(d.scatter.fam2,
                                 lang_measure == input$lmeasure &
                                   pop_measure == input$pmeasure)))
    })
  },
  options = list(height = 500)
)
get measures of developmental change in vocabulary
Predict AOA by social variables, by social variable
# Mean AoA per language/measure/lexical category.
# NOTE(review): aoa_data is not defined anywhere in this file — presumably
# loaded elsewhere (developmental AoA estimates); confirm its source.
aoa_central = aoa_data %>%
mutate(language = tolower(language)) %>%
group_by(language, measure, lexical_category) %>%
summarise(mean_aoa = mean(aoa))
# NOTE(review): these column names (log.RatioL2, log.pop2, log.numNeighbors,
# log.area) follow an older naming scheme than the d built above
# (ratio.L2.L1_log, pop_log, ...); this section appears to expect an
# earlier version of d — verify before running.
d3 = inner_join(d, aoa_central, by = "language") %>%
select(language, log.RatioL2, log.pop2,
log.numNeighbors, log.area,
mean.temp, mean_aoa, measure, lexical_category) %>%
mutate(language = as.factor(language)) %>%
gather(pop_measure, pop_value, 2:6)
# AoA against each demographic measure, labelled by language name and
# faceted by lexical category; one fit line per AoA measure.
ggplot(d3, aes(y = mean_aoa,
x = pop_value,
label = language,
group = measure,
color = measure)) +
geom_text(size = 3, aes(group = measure)) +
facet_grid(lexical_category ~ pop_measure, scales = "free") +
geom_smooth(method = "lm") +
theme_bw()
Fits controlling for language family (mean.fam.lang ~ mean.fam.pop), aggregating across families
# Aggregate every measure to the language-family mean, then reshape to the
# long (lang_measure, pop_measure) pair format.
# NOTE(review): uses the older column names (see note on d3 above).
d.scatter.fam = d %>%
select(log.n.consonants, log.n.vow, son, log.asjp.mean.length,
H.LDT, lun.mean.aoa, p.complexity.bias, m_obs,
log.RatioL2, log.pop2, growingSeason, mean.temp,
log.sum.precip, log.area, langFamily) %>%
rename(log.length = log.asjp.mean.length, lexical.diversity = H.LDT,
aoa = lun.mean.aoa, complexity.bias = p.complexity.bias,
dependency.length = m_obs, log.n.vowels = log.n.vow) %>%
group_by(langFamily) %>%
summarise_each(funs(mean(., na.rm = TRUE))) %>%
gather(lang_measure, lang_value,
which(is.element(names(.), lang_vars))) %>%
group_by(lang_measure) %>%
gather(pop_measure, pop_value,
which(is.element(names(.), demo_vars)))
# recode infinities to NA before correlating
is.na(d.scatter.fam) <- sapply(d.scatter.fam, is.infinite)
# get correlations
# Pearson correlation per measure pair across family means.
# NOTE(review): text.pos is not defined in this file — presumably label
# positions for the plot; confirm where it is created.
d.scatter.corrs = d.scatter.fam %>%
group_by(lang_measure, pop_measure) %>%
do(tidy(cor.test(~ .$pop_value + .$lang_value))) %>%
mutate(sig = ifelse(p.value < .05, "*", "")) %>%
mutate(sig.col = ifelse(p.value < .05 & estimate > 0, "pos",
ifelse(p.value < .05 & estimate < 0, "neg",
"none"))) %>%
mutate(lab = paste("r = ", round(estimate,2),sig, sep = "")) %>%
mutate(pop_value = .1, lang_value = .1) %>% # this is a hack
full_join(text.pos) %>%
ungroup
# Family-mean scatter per pair; background encodes correlation sign and
# significance (blue = negative, grey = n.s., red = positive).
ggplot(d.scatter.fam, aes(x = pop_value, y = lang_value)) +
geom_rect(data = d.scatter.corrs, aes(fill = sig.col),
xmin = -Inf, xmax = Inf,
ymin = -Inf, ymax = Inf, alpha = 0.2) +
geom_point(size = .3) +
geom_smooth(method = "lm", color = "green") +
facet_grid(lang_measure~pop_measure, scale = "free") +
scale_fill_manual(name = "grp",
values = c( "mediumblue", "grey99","red1")) +
theme_bw() +
xlab("Demographic variables") +
ylab("Language variables") +
theme(legend.position = "none")
Fits controlling for family – controlling for family in a linear model (lm(lang ~ pop + lang.family))
# Long-format pairs restricted to languages with log population > 2,
# keeping langFamily for the per-family fits.
# NOTE(review): uses the older column names (see note on d3 above).
d.scatter.fam2 = d %>%
filter(log.pop2>2) %>%
select(log.n.consonants, log.n.vow, son, log.asjp.mean.length,
H.LDT, lun.mean.aoa, p.complexity.bias, m_obs,
log.RatioL2, log.pop2, growingSeason, mean.temp,
log.sum.precip, log.area,langFamily) %>%
rename(log.length = log.asjp.mean.length, lexical.diversity = H.LDT,
aoa = lun.mean.aoa, complexity.bias = p.complexity.bias,
dependency.length = m_obs, log.n.vowels = log.n.vow) %>%
gather(lang_measure, lang_value,
which(is.element(names(.), lang_vars))) %>%
group_by(lang_measure) %>%
gather(pop_measure, pop_value,
which(is.element(names(.), demo_vars)))
# drop infinities and incomplete pairs
is.na(d.scatter.fam2) <- sapply(d.scatter.fam2, is.infinite)
d.scatter.fam2 <- filter(d.scatter.fam2, !is.na(pop_value), !is.na(lang_value))
# get correlations
# lm with a family interaction; the pop_value main effect is the slope
# within the reference family. Dummy pop_value/lang_value columns let this
# frame layer onto the scatter plot.
d.scatter.corrs = d.scatter.fam2 %>%
group_by(lang_measure, pop_measure) %>%
do(tidy(lm(lang_value ~ pop_value * langFamily, data = .))) %>%
filter(term == "pop_value") %>%
mutate(sig.col = ifelse(p.value < .05 & estimate > 0, "pos",
ifelse(p.value < .05 & estimate < 0, "neg",
"none"))) %>%
mutate(pop_value = .1, lang_value = .1) %>% # this is a hack
ungroup
# Per-pair scatter colored by family; background encodes the lm result
# (blue = negative, grey = n.s., red = positive).
ggplot(d.scatter.fam2, aes(x = pop_value, y = lang_value)) +
geom_rect(data = d.scatter.corrs, aes(fill = sig.col),
xmin = -Inf, xmax = Inf,
ymin = -Inf, ymax = Inf, alpha = 0.2) +
geom_point(size = .3, aes(col = langFamily)) +
geom_smooth(method = "lm", color = "green") +
facet_grid(lang_measure~pop_measure, scales = "free") +
scale_fill_manual(name = "grp",
values = c( "mediumblue", "grey99","red1")) +
theme_bw() +
xlab("Demographic variables") +
ylab("Language variables") +
theme(legend.position = "none")
# these are the variables we care about for the paper
# ("n.neighbors_log" appeared twice in the original list; the is.element()
# checks below are unaffected, but each name is listed once here)
demo_vars_crit = c("n.neighbors_log",
                   "mean.temp", "sum.precip", "sd.temp",
                   "pop_log", "area_log",
                   "sd.precip_log", "ratio.L2.L1_log",
                   "distance.from.origin")
lang_vars_crit = c("p.complexity.bias","scaled.LDT.TTR",
                   "mean.dependency.length",
                   "mean.length", "mean.aoa",
                   "n.consonants_log", "n.vowels_log", "morphological.complexity")
# All-pairs long format over the full cleaned dataset (no family filter).
d.scatter = d.clean %>%
gather(lang_measure, lang_value,
which(is.element(names(.), lang_vars_crit))) %>%
group_by(lang_measure) %>%
gather(pop_measure, pop_value,
which(is.element(names(.), demo_vars_crit))) %>%
select(pop_measure, lang_measure, pop_value, lang_value)
# recode infinities before correlating
is.na(d.scatter) <- sapply(d.scatter, is.infinite)
# get correlations
# Simple Pearson correlations per pair (no family/country controls); the
# dummy pop_value/lang_value columns let this layer onto the scatter plot.
d.scatter.corrs = d.scatter %>%
group_by(lang_measure, pop_measure) %>%
do(tidy(cor.test(~ .$pop_value + .$lang_value))) %>%
mutate(sig.col = ifelse(p.value < .05 & estimate > 0, "pos",
ifelse(p.value < .05 & estimate < 0, "neg",
"none"))) %>%
mutate(pop_value = .1, lang_value = .1) %>% # this is a hack
ungroup
# Raw scatter per pair; background encodes correlation sign/significance
# (blue = negative, grey = n.s., red = positive).
ggplot(d.scatter, aes(x = pop_value, y = lang_value)) +
geom_rect(data = d.scatter.corrs, aes(fill = sig.col),
xmin = -Inf, xmax = Inf,
ymin = -Inf, ymax = Inf, alpha = 0.2) +
geom_point(size = .3) +
geom_smooth(method = "lm", color = "green") +
facet_grid(lang_measure~pop_measure, scale = "free") +
scale_fill_manual(values = c( "mediumblue", "grey99","red1")) +
theme_bw() +
xlab("Demographic variables") +
ylab("Language variables") +
theme(legend.position = "none")